*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
* Replication Files to: P-hacking in clinical trials and how incentives shape the distribution of results across phases 
* Date: April 2020
* Authors: Jérôme Adda, Christian Decker, and Marco Ottaviani
*
*	- Input: 'linked_phases.dta', 'data_counterfactual_analysis_sec.dta'
* 			 		 
*	- Output: 'data_selection_functions_sec.dta' & Table S6
*			 
* Topic: Estimate 'Placebo' Selection Functions based on Secondary Outcomes
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*

*------------------------*
* Section 0 -- Directory *
*------------------------*

* Start from a clean session: drop data and matrices, close any open log,
* and stop Stata from pausing on long output.
clear all
clear matrix
cap log close
set more off

*-----*
* 0.1 *
*-----*

* Run the do-file that defines the globals holding the folder and sub-folder
* paths for both data and analysis (this script uses: data, final, run, log,
* table, dofiles).


if c(username) == "chrdec" {													
	do "C:/Users/chrdec/Dropbox/clinical trials/stata/PNAS revision/2_analysis/dofiles/0_globals.do"   			// 0_globals.do path

}

* Fail early with a clear message when the path globals are not available
* (e.g. a different user name and no master do-file that defined them);
* otherwise the script only breaks later at -use- with a cryptic file error.
if `"${data}"' == "" {
	display as error "Path global -data- is not defined: run 0_globals.do (or define the path globals) before this do-file."
	exit 198
}


*-----*
* 0.2 * 
*-----*

* Set up the directory where data are stored
macro list
cd "${data}"
set matsize 11000

* Close (not merely suspend) any log that is still open: -log using- refuses
* to start while a log is open, even one suspended with -log off-.
cap log close
log using "${run}${log}log_7b.txt", replace

*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~

*--------------------*
* Table of contents: *
*--------------------*
* Sections: 
* 1. Prepare Data
* 2. Estimate and Tabulate 


*---------------------------*
* Section 1 -- Prepare Data *
*---------------------------*


* Master file: linked phases, restricted to Ph2==1, keeping only the trial
* identifier and the `matched' indicator.
use "${data}${final}linked_phases.dta", clear
keep if Ph2 == 1
keep nct_id matched

* Variables pulled in from the counterfactual-analysis file
* (presumably one row per secondary outcome -- confirm against that file).
local sec_vars nct_id INDUSTRY p z z_modifier D0 D1 D2 sqrt_enrollment      ///
	top10rev placebo Ph2 completion_year mesh_code rev_rank rxsales_rank    ///
	rdspend_rank rep_rank

* 1:m merge: nct_id must be unique in the master and may repeat in the using
* file. Only rows found in both files are kept.
merge 1:m nct_id using "${data}${final}data_counterfactual_analysis_sec.dta", keepusing(`sec_vars')
keep if _merge == 3
drop _merge
* Ph2 now comes from the using file (the master copy was dropped above).
keep if Ph2 == 1

* N = number of rows per trial
by nct_id, sort: generate N = _N

** generate dummies for completion year and mesh category
tab completion_year
* One indicator cy_<year> per completion year observed in 2007-2018.
* Omitted category: before 2007.
levelsof completion_year if completion_year>2006 & completion_year<2019, local(cys)
foreach y of local cys {
	gen cy_`y' = (completion_year==`y')
}

* cy_2018 means 2018 or later.
* Create the variable first if no trial completed exactly in 2018, so the
* -replace- below cannot fail with "variable not found".
cap confirm variable cy_2018
if _rc {
	gen cy_2018 = 0
}
* Stata treats missing as larger than any number, so a bare
* "completion_year>2018" would also flag trials with a missing completion
* year as "2018 or later". Exclude them explicitly; they stay in the omitted
* category, as missing mesh codes do.
replace cy_2018=1 if completion_year>2018 & !missing(completion_year)


tab mesh_code

* "/" cannot appear in a variable name, so build a name-safe copy of the code
* with "/" replaced by "_" (at most 10 replacements per value).
gen mesh_aux = subinstr(mesh_code, "/", "_", 10)

* Codes that get their own indicator. Omitted category: missing/others
* (missing, C01, C02, C03, C11, C15, C24, C26).
levelsof mesh_aux if !inlist(mesh_code, "C01", "C02", "C03", "C11", "C15", "C24", "C26", "missing"), local(code) clean

* mesh_cat: 1 for the omitted category, 2, 3, ... for the listed codes in
* -levelsof- order. mc_<code>: 0/1 indicator for each listed code.
gen mesh_cat = 1
local count = 1
foreach c of local code {
	local ++count
	gen mc_`c' = (mesh_aux == "`c'")
	replace mesh_cat = `count' if mesh_aux == "`c'"
}
drop mesh_aux

save "${data}${final}data_selection_functions_sec.dta", replace


*------------------------------------*
* Section 2 -- Estimate and Tabulate *
*------------------------------------*

use "${data}${final}data_selection_functions_sec.dta",clear

gen SPLIT=top10rev

* Replace z for rows with D0==1 and a non-zero z_modifier by a conditional
* mean taken over the rows with z_modifier==0:
*   z_modifier==-1 : mean of z among z_modifier==0 rows with z <= this row's z
*   z_modifier== 1 : mean of z among z_modifier==0 rows with z >= this row's z
* (presumably z_modifier flags z-scores known only as a bound -- confirm
* against the data construction files).
* Only rows with z_modifier!=0 are overwritten and the means use only rows
* with z_modifier==0, so the order in which rows are processed is irrelevant.
* If no z_modifier==0 row satisfies the condition, r(mean) is missing and z
* becomes missing for that row.
gen aux=1 if D0==1 & z_modifier!=0
count if aux==1
local N=r(N)
* Missing sorts last, so the aux==1 rows occupy observations 1..N.
sort aux z
forvalues j=1(1)`N' {
	if z_modifier[`j']==-1 {
		* meanonly: only r(mean) is used; -detail- would compute percentiles
		* on every iteration for nothing.
		qui sum z if z<=z[`j'] & z_modifier==0, meanonly
		qui replace z=r(mean) in `j'
	}
	if z_modifier[`j']==1 {
		qui sum z if z>=z[`j'] & z_modifier==0, meanonly
		qui replace z=r(mean) in `j'
	}
}
drop aux


* Regressor of interest: z interacted with D0
gen z_Ph2=D0*z

local covariates z_Ph2 D1 D2 sqrt_enrollment placebo cy_* mc_*
local path "${run}${table}tabS6.xls"
local stderr "cl mesh_cat"

* Table S6 has three columns: the same logit on three samples. For each column
* define the sample condition, the column title, and whether -outreg2- starts
* a new file (replace) or adds a column (append).
local cond1 "INDUSTRY==1"
local cond2 "INDUSTRY==1 & SPLIT==0"
local cond3 "INDUSTRY==1 & SPLIT==1"
local head1 "All"
local head2 "Small"
local head3 "Top 10"
local mode1 "replace"
local mode2 "append"
local mode3 "append"

forvalues s = 1/3 {
	* Mean of the dependent variable on the column's sample
	sum matched if `cond`s''
	local depmean = r(mean)

	* Logit of `matched' with standard errors clustered on mesh_cat
	logit matched `covariates' if `cond`s'', vce(`stderr')

	* -unique- must run right before -outreg2-: its r(sum) is passed on as
	* "No. of trials".
	unique nct_id if `cond`s''
	outreg2 using "`path'", `mode`s'' adds("Mean dep. var.",`depmean', "No. of trials" ,r(sum)) ct("`head`s''","industry") nocons keep(z_Ph2 D1 D2) addtext(Controls,yes,Completion Year FE,yes,Mesh Condition FE,yes)
}


log close
cd "${run}${dofiles}"
